setwd("C:/R_DS")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## Warning: package 'readr' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library(dplyr)
# read_html() and html_table() come from rvest, which is not among the packages
# attached by library(tidyverse) (see the startup message above), so it has to
# be loaded explicitly.
library(rvest)
# Download and parse the Worldometer coronavirus page.
corona_rul <- read_html("https://www.worldometers.info/coronavirus/")
# Extract the HTML tables found on the page (returned as a list).
corona_file <- html_table(corona_rul)
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(corona_file)
}
# Keep the first table of the page.
corona_file_new <- corona_file[[1]]
#View(corona_file_new)
str(corona_file_new)
## 'data.frame': 237 obs. of 19 variables:
## $ # : int NA NA NA NA NA NA NA NA 1 2 ...
## $ Country,Other : chr "North America" "South America" "Asia" "Europe" ...
## $ TotalCases : chr "36,190,724" "21,586,450" "29,439,551" "40,500,869" ...
## $ NewCases : chr "+8,613" "+467" "+104,476" "+60,062" ...
## $ TotalDeaths : chr "824,771" "565,299" "435,292" "927,775" ...
## $ NewDeaths : chr "+210" "+8" "+877" "+1,387" ...
## $ TotalRecovered : chr "27,898,884" "19,239,136" "26,705,986" "28,622,164" ...
## $ NewRecovered : chr "+4,273" "+450" "+66,729" "+82,655" ...
## $ ActiveCases : chr "7,467,069" "1,782,015" "2,298,273" "10,950,930" ...
## $ Serious,Critical : chr "15,433" "21,392" "26,450" "31,600" ...
## $ Tot Cases/1M pop : chr "" "" "" "" ...
## $ Deaths/1M pop : chr "" "" "" "" ...
## $ TotalTests : chr "" "" "" "" ...
## $ Tests/1M pop : chr "" "" "" "" ...
## $ Population : chr "" "" "" "" ...
## $ Continent : chr "North America" "South America" "Asia" "Europe" ...
## $ 1 Caseevery X ppl : chr "" "" "" "" ...
## $ 1 Deathevery X ppl: chr "" "" "" "" ...
## $ 1 Testevery X ppl : chr "" "" "" "" ...
# Round-trip the scraped table through a CSV file. read.csv() rewrites the
# column names into syntactic form ("#" becomes "X.", "Country,Other" becomes
# "Country.Other", ...) and the rename() step further down relies on those names.
# A single path variable is used for both calls so the write and the read can
# never point at different files; the relative path resolves against the
# working directory set at the top of the script.
corona_csv_path <- "corona_file.csv"
write.table(corona_file_new, file = corona_csv_path,
            sep = ",",
            row.names = FALSE)
corona_file_new <- read.csv(corona_csv_path)
# Drop the first eight rows: in the str() output above they have NA in the "#"
# column and start with continent names, i.e. they are not country rows.
corona_data <- data.frame(corona_file_new[-(1:8), ])
head(corona_data)
## X. Country.Other TotalCases NewCases TotalDeaths NewDeaths TotalRecovered
## 9 1 USA 31,425,966 +5,635 568,834 +57 23,946,970
## 10 2 Brazil 12,984,956 331,530 11,357,521
## 11 3 India 12,625,146 +37,226 165,293 +161 11,698,657
## 12 4 France 4,822,470 96,678 299,624
## 13 5 Russia 4,589,540 +8,646 100,717 +343 4,211,133
## 14 6 UK 4,362,150 +2,762 126,862 +26 3,912,562
## NewRecovered ActiveCases Serious.Critical Tot.Cases.1M.pop Deaths.1M.pop
## 9 +267 6,910,162 8,716 94,522 1,711
## 10 1,295,905 8,318 60,761 1,551
## 11 +18,699 761,196 8,944 9,081 119
## 12 4,426,168 5,341 73,757 1,479
## 13 +7,052 277,690 2,300 31,439 690
## 14 +10,920 322,726 517 64,002 1,861
## TotalTests Tests.1M.pop Population Continent X1.Caseevery.X.ppl
## 9 409,404,894 1,231,390 332,473,823 North America 11
## 10 28,600,000 133,830 213,704,094 South America 16
## 11 249,019,657 179,116 1,390,271,710 Asia 110
## 12 66,728,544 1,020,577 65,383,138 Europe 14
## 13 121,900,000 835,035 145,981,988 Europe 32
## 14 127,546,869 1,871,386 68,156,365 Europe 16
## X1.Deathevery.X.ppl X1.Testevery.X.ppl
## 9 584 1
## 10 645 7
## 11 8,411 6
## 12 676 1
## 13 1,449 1
## 14 537 1
# Drop the eight leading rows and rows 228-236 of the re-read table.
# (A preceding assignment that dropped the first column was removed: it was
# overwritten by this statement before its result was ever used.)
# NOTE(review): the row positions are hard-coded; rows 228-236 are presumably
# trailing aggregate rows of the scraped table -- confirm against the page.
corona_data <- data.frame(corona_file_new[c(-(1:8), -(228:236)), ])
#View(corona_data)
# Change column names with rename(): the dots introduced by read.csv() are
# replaced with underscores and "X." becomes "S.No.".
corona_data_updated <- corona_data %>%
  rename(S.No. = "X.",
         Country_Other = "Country.Other",
         Serious_Critical = "Serious.Critical",
         Tot_Cases_1M_pop = "Tot.Cases.1M.pop",
         Deaths_1M_pop = "Deaths.1M.pop",
         Tests_1M_pop = "Tests.1M.pop",
         X1_Caseevery_X_ppl = "X1.Caseevery.X.ppl",
         X1_Deathevery_X_ppl = "X1.Deathevery.X.ppl",
         X1_Testevery_X_ppl = "X1.Testevery.X.ppl")
head(corona_data_updated)
## S.No. Country_Other TotalCases NewCases TotalDeaths NewDeaths TotalRecovered
## 9 1 USA 31,425,966 +5,635 568,834 +57 23,946,970
## 10 2 Brazil 12,984,956 331,530 11,357,521
## 11 3 India 12,625,146 +37,226 165,293 +161 11,698,657
## 12 4 France 4,822,470 96,678 299,624
## 13 5 Russia 4,589,540 +8,646 100,717 +343 4,211,133
## 14 6 UK 4,362,150 +2,762 126,862 +26 3,912,562
## NewRecovered ActiveCases Serious_Critical Tot_Cases_1M_pop Deaths_1M_pop
## 9 +267 6,910,162 8,716 94,522 1,711
## 10 1,295,905 8,318 60,761 1,551
## 11 +18,699 761,196 8,944 9,081 119
## 12 4,426,168 5,341 73,757 1,479
## 13 +7,052 277,690 2,300 31,439 690
## 14 +10,920 322,726 517 64,002 1,861
## TotalTests Tests_1M_pop Population Continent X1_Caseevery_X_ppl
## 9 409,404,894 1,231,390 332,473,823 North America 11
## 10 28,600,000 133,830 213,704,094 South America 16
## 11 249,019,657 179,116 1,390,271,710 Asia 110
## 12 66,728,544 1,020,577 65,383,138 Europe 14
## 13 121,900,000 835,035 145,981,988 Europe 32
## 14 127,546,869 1,871,386 68,156,365 Europe 16
## X1_Deathevery_X_ppl X1_Testevery_X_ppl
## 9 584 1
## 10 645 7
## 11 8,411 6
## 12 676 1
## 13 1,449 1
## 14 537 1
#View(corona_data_updated)
# First method (kept for reference): name each column to drop.
#corona_data %>% select(-NewCases,-NewDeaths,-NewRecovered )
#View(corona_data_updated)
# Second method: drop every column whose name starts with "New".
corona_data_updated <- dplyr::select(
  corona_data_updated,
  -starts_with("New")
)
head(corona_data_updated)
## S.No. Country_Other TotalCases TotalDeaths TotalRecovered ActiveCases
## 9 1 USA 31,425,966 568,834 23,946,970 6,910,162
## 10 2 Brazil 12,984,956 331,530 11,357,521 1,295,905
## 11 3 India 12,625,146 165,293 11,698,657 761,196
## 12 4 France 4,822,470 96,678 299,624 4,426,168
## 13 5 Russia 4,589,540 100,717 4,211,133 277,690
## 14 6 UK 4,362,150 126,862 3,912,562 322,726
## Serious_Critical Tot_Cases_1M_pop Deaths_1M_pop TotalTests Tests_1M_pop
## 9 8,716 94,522 1,711 409,404,894 1,231,390
## 10 8,318 60,761 1,551 28,600,000 133,830
## 11 8,944 9,081 119 249,019,657 179,116
## 12 5,341 73,757 1,479 66,728,544 1,020,577
## 13 2,300 31,439 690 121,900,000 835,035
## 14 517 64,002 1,861 127,546,869 1,871,386
## Population Continent X1_Caseevery_X_ppl X1_Deathevery_X_ppl
## 9 332,473,823 North America 11 584
## 10 213,704,094 South America 16 645
## 11 1,390,271,710 Asia 110 8,411
## 12 65,383,138 Europe 14 676
## 13 145,981,988 Europe 32 1,449
## 14 68,156,365 Europe 16 537
## X1_Testevery_X_ppl
## 9 1
## 10 7
## 11 6
## 12 1
## 13 1
## 14 1
#View(corona_data_updated)
library(dplyr)
# Helper for stripping the thousands separators from the scraped figures.
set.seed(1)
# mysub(x): delete every "," from each element of x.
#   x: vector of text values (e.g. "31,425,966")
#   returns: the result of gsub(), i.e. x as character with all commas removed
mysub <- function(x) {
  gsub(pattern = ",", replacement = "", x = x, fixed = TRUE)
}
# Apply mysub() to every column that holds formatted numbers: columns 3-12 and
# 14-16 (column 13 is Continent).
# The previous index `3:12:15` was parsed as `(3:12):15`; R used only the first
# element of `3:12` and warned, so the effective range was 3:15. Column 16 was
# therefore never cleaned, and any value in it that contained a comma turned
# into NA in the numeric conversion that follows.
# lapply() works column by column, avoiding the matrix coercion apply() performs
# on a data frame.
comma_cols <- c(3:12, 14:16)
corona_data_updated[comma_cols] <- lapply(corona_data_updated[comma_cols], mysub)
# View() opens a data viewer, so only call it when a user is at the console.
if (interactive()) {
  View(corona_data_updated)
}
str(corona_data_updated)
## 'data.frame': 220 obs. of 16 variables:
## $ S.No. : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country_Other : chr "USA" "Brazil" "India" "France" ...
## $ TotalCases : chr "31425966" "12984956" "12625146" "4822470" ...
## $ TotalDeaths : chr "568834" "331530" "165293" "96678" ...
## $ TotalRecovered : chr "23946970" "11357521" "11698657" "299624" ...
## $ ActiveCases : chr "6910162" "1295905" "761196" "4426168" ...
## $ Serious_Critical : chr "8716" "8318" "8944" "5341" ...
## $ Tot_Cases_1M_pop : chr "94522" "60761" "9081" "73757" ...
## $ Deaths_1M_pop : chr "1711" "1551" "119" "1479" ...
## $ TotalTests : chr "409404894" "28600000" "249019657" "66728544" ...
## $ Tests_1M_pop : chr "1231390" "133830" "179116" "1020577" ...
## $ Population : chr "332473823" "213704094" "1390271710" "65383138" ...
## $ Continent : chr "North America" "South America" "Asia" "Europe" ...
## $ X1_Caseevery_X_ppl : chr "11" "16" "110" "14" ...
## $ X1_Deathevery_X_ppl: chr "584" "645" "8411" "676" ...
## $ X1_Testevery_X_ppl : chr "1" "7" "6" "1" ...
# Convert the cleaned text columns to numeric with lapply().
# Values that are not valid numbers become NA, which is what the recorded
# coercion warnings report.
colms <- 3:12
corona_data_updated[colms] <- lapply(X = corona_data_updated[colms], FUN = as.numeric)
## Warning in lapply(corona_data_updated[colms], as.numeric): NAs introduced by
## coercion
## Warning in lapply(corona_data_updated[colms], as.numeric): NAs introduced by
## coercion
# Same conversion for the three "1 ... every X ppl" columns after Continent.
corona_data_updated[14:16] <- lapply(X = corona_data_updated[14:16], FUN = as.numeric)
## Warning in lapply(corona_data_updated[14:16], as.numeric): NAs introduced by
## coercion
## 'data.frame': 220 obs. of 16 variables:
## $ S.No. : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country_Other : chr "USA" "Brazil" "India" "France" ...
## $ TotalCases : num 31425966 12984956 12625146 4822470 4589540 ...
## $ TotalDeaths : num 568834 331530 165293 96678 100717 ...
## $ TotalRecovered : num 23946970 11357521 11698657 299624 4211133 ...
## $ ActiveCases : num 6910162 1295905 761196 4426168 277690 ...
## $ Serious_Critical : num 8716 8318 8944 5341 2300 ...
## $ Tot_Cases_1M_pop : num 94522 60761 9081 73757 31439 ...
## $ Deaths_1M_pop : num 1711 1551 119 1479 690 ...
## $ TotalTests : num 4.09e+08 2.86e+07 2.49e+08 6.67e+07 1.22e+08 ...
## $ Tests_1M_pop : num 1231390 133830 179116 1020577 835035 ...
## $ Population : num 3.32e+08 2.14e+08 1.39e+09 6.54e+07 1.46e+08 ...
## $ Continent : chr "North America" "South America" "Asia" "Europe" ...
## $ X1_Caseevery_X_ppl : num 11 16 110 14 32 16 16 24 14 29 ...
## $ X1_Deathevery_X_ppl: num 584 645 8411 676 1449 ...
## $ X1_Testevery_X_ppl : num 1 7 6 1 1 1 1 2 1 2 ...
# Per-column summary of the converted data; the NA's rows in the recorded output
# show how many values are missing in each numeric column.
summary(corona_data_updated)
## S.No. Country_Other TotalCases TotalDeaths
## Min. : 1.0 Length:220 Min. : 3 Min. : 1
## 1st Qu.: 55.5 Class :character 1st Qu.: 4286 1st Qu.: 93
## Median :110.0 Mode :character Median : 38008 Median : 748
## Mean :110.0 Mean : 1200352 Mean : 27963
## 3rd Qu.:164.5 3rd Qu.: 262992 3rd Qu.: 6083
## Max. :219.0 Max. :132083861 Max. :2868477
## NA's :1 NA's :15
## TotalRecovered ActiveCases Serious_Critical Tot_Cases_1M_pop
## Min. : 1 Min. : 0 Min. : 1.0 Min. : 7
## 1st Qu.: 3024 1st Qu.: 255 1st Qu.: 13.0 1st Qu.: 1724
## Median : 27074 Median : 2952 Median : 74.0 Median : 14607
## Mean : 975276 Mean : 200203 Mean : 1360.6 Mean : 28138
## 3rd Qu.: 222100 3rd Qu.: 26488 3rd Qu.: 522.5 3rd Qu.: 49536
## Max. :106359121 Max. :22856263 Max. :97962.0 Max. :158818
## NA's :2 NA's :2 NA's :76 NA's :2
## Deaths_1M_pop TotalTests Tests_1M_pop Population
## Min. : 0.3 Min. : 470 Min. : 688 Min. :8.030e+02
## 1st Qu.: 33.5 1st Qu.: 110172 1st Qu.: 47114 1st Qu.:6.560e+05
## Median : 215.0 Median : 829634 Median : 232236 Median :6.605e+06
## Mean : 498.5 Mean : 8605639 Mean : 542121 Mean :2.939e+07
## 3rd Qu.: 769.5 3rd Qu.: 4051128 3rd Qu.: 606038 3rd Qu.:2.385e+07
## Max. :2791.0 Max. :409404894 Max. :6392809 Max. :1.390e+09
## NA's :17 NA's :14 NA's :14 NA's :3
## Continent X1_Caseevery_X_ppl X1_Deathevery_X_ppl X1_Testevery_X_ppl
## Length:220 Min. : 6 Min. : 358 Min. : 0.00
## Class :character 1st Qu.: 20 1st Qu.: 1286 1st Qu.: 2.00
## Mode :character Median : 70 Median : 4724 Median : 4.00
## Mean : 3376 Mean : 90261 Mean : 25.93
## 3rd Qu.: 581 3rd Qu.: 30098 3rd Qu.: 21.00
## Max. :150108 Max. :2905295 Max. :665.00
## NA's :3 NA's :18 NA's :15
library(tidyverse)
# Keep only the rows that have no NA in any column.
# NOTE(review): the recorded summaries show the table shrinking from 220 to 139
# rows here, and a later comment states that China is lost in this step --
# confirm that dropping a country for a single missing field is intended.
corona_data_updated <- corona_data_updated %>%
  na.omit()
summary(corona_data_updated)
## S.No. Country_Other TotalCases TotalDeaths
## Min. : 1.00 Length:139 Min. : 420 Min. : 4
## 1st Qu.: 39.50 Class :character 1st Qu.: 17899 1st Qu.: 175
## Median : 83.00 Mode :character Median : 117757 Median : 1662
## Mean : 85.48 Mean : 897895 Mean : 19713
## 3rd Qu.:126.50 3rd Qu.: 482413 3rd Qu.: 9340
## Max. :198.00 Max. :31425966 Max. :568834
## TotalRecovered ActiveCases Serious_Critical Tot_Cases_1M_pop
## Min. : 44 Min. : 8 Min. : 1.0 Min. : 203
## 1st Qu.: 13459 1st Qu.: 1302 1st Qu.: 13.0 1st Qu.: 8014
## Median : 88585 Median : 11854 Median : 72.0 Median : 23313
## Mean : 731429 Mean : 146753 Mean : 696.9 Mean : 35945
## 3rd Qu.: 362899 3rd Qu.: 43792 3rd Qu.: 504.5 3rd Qu.: 59481
## Max. :23946970 Max. :6910162 Max. :8944.0 Max. :158818
## Deaths_1M_pop TotalTests Tests_1M_pop Population
## Min. : 1.0 Min. : 1331 Min. : 3682 Min. :1.109e+04
## 1st Qu.: 92.5 1st Qu.: 225622 1st Qu.: 93658 1st Qu.:1.808e+06
## Median : 331.0 Median : 1377915 Median : 284796 Median :7.199e+06
## Mean : 611.0 Mean : 11860445 Mean : 548750 Mean :3.636e+07
## 3rd Qu.: 944.5 3rd Qu.: 7881671 3rd Qu.: 672932 3rd Qu.:3.300e+07
## Max. :2523.0 Max. :409404894 Max. :4597508 Max. :1.390e+09
## Continent X1_Caseevery_X_ppl X1_Deathevery_X_ppl X1_Testevery_X_ppl
## Length:139 Min. : 6.0 Min. : 396 Min. : 0.00
## Class :character 1st Qu.: 17.0 1st Qu.: 1058 1st Qu.: 1.50
## Mode :character Median : 43.0 Median : 3021 Median : 4.00
## Mean : 220.7 Mean : 20018 Mean : 15.94
## 3rd Qu.: 126.0 3rd Qu.: 10840 3rd Qu.: 11.00
## Max. :4932.0 Max. :736136 Max. :272.00
#View(corona_data_updated)
# Store Continent as a factor so it can be used as a grouping variable.
corona_data_updated[["Continent"]] <- factor(corona_data_updated[["Continent"]])
str(corona_data_updated[["Continent"]])
## Factor w/ 6 levels "Africa","Asia",..: 5 6 2 4 4 4 4 2 4 4 ...
# Two copies of the cleaned table; data1 is the one used by the plotly section.
df <- corona_data_updated
data1 <- corona_data_updated
library(forcats)
library(tidyverse)
library(ggplot2)
# Which continent has the largest number of cases?
# Cases are summed over all countries of a continent. The previous version used
# max(), which reports each continent's single largest country figure rather
# than the continent total.
# NOTE: the recorded output that follows was produced by the max() version.
continent_TotalCases <- corona_data_updated %>%
  mutate(Continent = fct_lump(Continent, n = 5)) %>%
  group_by(Continent) %>%
  summarise(TotalCases = sum(TotalCases, na.rm = TRUE)) %>%
  arrange(desc(TotalCases))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
## Continent TotalCases
## <fct> <dbl>
## 1 North America 31425966
## 2 South America 12984956
## 3 Asia 12625146
## 4 Europe 4822470
## 5 Africa 1551964
## 6 Australia/Oceania 18633
# #plot
# continent_TotalCases <- corona_data_updated %>% mutate(Continent = fct_lump(Continent, n=5 )) %>%
# group_by(Continent) %>%
# summarise(TotalCases=max(TotalCases, na.rm=TRUE)) %>%
# ggplot(aes(x=Continent, y= TotalCases))+geom_col()
#
# Which continent has the least number of deaths?
# Deaths are summed over all countries of a continent and sorted ascending, so
# the first row is the answer. The previous version used min(), which reports
# each continent's smallest single-country figure rather than the continent total.
# NOTE: the recorded output that follows was produced by the min() version.
library(dplyr)
continent_leastdeaths <- corona_data_updated %>%
  mutate(Continent = fct_lump(Continent, n = 5)) %>%
  group_by(Continent) %>%
  dplyr::summarise(TotalDeaths = sum(TotalDeaths, na.rm = TRUE)) %>%
  arrange(TotalDeaths)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
## Continent TotalDeaths
## <fct> <dbl>
## 1 Australia/Oceania 4
## 2 North America 10
## 3 Asia 12
## 4 Europe 29
## 5 Africa 66
## 6 South America 93
# What is the current status of China?
# There is no data for China: its row was removed when the rows containing NAs
# were dropped. Three other countries are reported instead.
library(dplyr)

# country_status(data, country): headline figures for one country.
#   data:    data frame with the columns Country_Other, TotalCases, TotalDeaths,
#            ActiveCases and Serious_Critical
#   country: name to match against Country_Other
#   returns: the matching row(s) restricted to those five columns (zero rows
#            when the country is not present)
# Replaces three copies of the same pipeline, which indexed the global data
# frame with `$` inside filter() and took max() over a single row.
country_status <- function(data, country) {
  data %>%
    dplyr::filter(Country_Other == !!country) %>%
    dplyr::select(Country_Other, TotalCases, TotalDeaths, ActiveCases, Serious_Critical)
}

# current status of Brazil
current_status_Brazil <- country_status(corona_data_updated, "Brazil")
current_status_Brazil
## Country_Other TotalCases TotalDeaths ActiveCases Serious_Critical
## 1 Brazil 12984956 331530 1295905 8318
# current status of USA
current_status_USA <- country_status(corona_data_updated, "USA")
current_status_USA
## Country_Other TotalCases TotalDeaths ActiveCases Serious_Critical
## 1 USA 31425966 568834 6910162 8716
# current status of India
current_status_India <- country_status(corona_data_updated, "India")
current_status_India
## Country_Other TotalCases TotalDeaths ActiveCases Serious_Critical
## 1 India 12625146 165293 761196 8944
#************** What is the current status of China?
#***************** Method-2 using TotalDeaths
# There is no data for China: its row was removed when the rows containing NAs
# were dropped. Three other countries are used for this question.
# A country whose TotalDeaths is at or above the mean gets rank "1" (dangerous
# situation); a country below the mean gets rank "0" (safe zone).
library(dplyr)
TotalDeaths_status <- corona_data_updated %>%
  dplyr::summarise(TotalDeaths = mean(TotalDeaths, na.rm = TRUE))
#TotalDeaths_status
# Use the mean computed above as the threshold. The previous hard-coded 13235
# did not follow the data (the summary() output recorded earlier shows a mean
# of 19713), and the page is scraped live so the mean changes between runs.
# NOTE: the recorded final_satus() results below came from the 13235 threshold.
deaths_threshold <- TotalDeaths_status$TotalDeaths
current_status <- mutate(
  corona_data_updated,
  rank = ifelse(TotalDeaths >= deaths_threshold, "1", "0")
)
current_status$rank <- as.factor(current_status$rank)
# Countries flagged "1"; final_satus() looks names up in this table.
current_status_country <- current_status %>%
  dplyr::filter(rank == "1") %>%
  dplyr::select(Country_Other, TotalCases, TotalDeaths)
#current_status_country
# final_satus(x): status message for one or more countries.
#   x: character vector of country names
#   returns: for each name, the "Dangerous situation" message when the name
#            occurs in current_status_country$Country_Other (a table defined
#            outside this function and looked up at call time), otherwise the
#            "Safe" message
final_satus <- function(x) {
  is_listed <- x %in% current_status_country$Country_Other
  ifelse(
    is_listed,
    "The country is in **Dangerous situation**",
    "The country is Safe"
  )
}
# Status of different countries.
# NOTE(review): the recorded output below lists five results but only this one
# call is present; the other four calls appear to be missing from the file.
final_satus("USA")
## [1] "The country is in **Dangerous situation**"
## [1] "The country is Safe"
## [1] "The country is in **Dangerous situation**"
## [1] "The country is in **Dangerous situation**"
## [1] "The country is Safe"
# Arrange all data by the number of cases per million population, highest first.
# The ratio is stored in a named column and used as the sort key. The previous
# version left the computed column unnamed and sorted by raw TotalCases, so the
# per-million figure never influenced the order.
# NOTE: the recorded output that follows was produced by the TotalCases ordering.
cases_per_million_population <- corona_data_updated %>%
  mutate(cases_per_million = TotalCases / (Population / 1000000)) %>%
  arrange(desc(cases_per_million))
head(cases_per_million_population)
## S.No. Country_Other TotalCases TotalDeaths TotalRecovered ActiveCases
## 1 1 USA 31425966 568834 23946970 6910162
## 2 2 Brazil 12984956 331530 11357521 1295905
## 3 3 India 12625146 165293 11698657 761196
## 4 4 France 4822470 96678 299624 4426168
## 5 5 Russia 4589540 100717 4211133 277690
## 6 6 UK 4362150 126862 3912562 322726
## Serious_Critical Tot_Cases_1M_pop Deaths_1M_pop TotalTests Tests_1M_pop
## 1 8716 94522 1711 409404894 1231390
## 2 8318 60761 1551 28600000 133830
## 3 8944 9081 119 249019657 179116
## 4 5341 73757 1479 66728544 1020577
## 5 2300 31439 690 121900000 835035
## 6 517 64002 1861 127546869 1871386
## Population Continent X1_Caseevery_X_ppl X1_Deathevery_X_ppl
## 1 332473823 North America 11 584
## 2 213704094 South America 16 645
## 3 1390271710 Asia 110 8411
## 4 65383138 Europe 14 676
## 5 145981988 Europe 32 1449
## 6 68156365 Europe 16 537
## X1_Testevery_X_ppl
## 1 1
## 2 7
## 3 6
## 4 1
## 5 1
## 6 1
## corona_data_updated$TotalCases/(corona_data_updated$Population/1e+06)
## 1 94521.625
## 2 60761.382
## 3 9081.064
## 4 73757.090
## 5 31439.084
## 6 64002.093
#View(cases_per_million_population)
# Which country ranks first based on total number of cases per million
# population and which country ranks last?
# Country that ranks first on cases per million population.
# The per-million ratio is kept as a named column and used for both the sort
# and the ranking. Previously the ratio was dropped by select(), arrange() was
# handed the whole data frame as its sort key, and the ranking compared raw
# TotalCases.
# NOTE: the recorded output that follows was produced by the TotalCases ranking.
cases_per_million_population <- corona_data_updated %>%
  mutate(cases_per_million = TotalCases / (Population / 1000000)) %>%
  select(Country_Other, TotalCases, Population, cases_per_million) %>%
  arrange(desc(cases_per_million))
cases_per_rankONE <- cases_per_million_population %>%
  filter(cases_per_million == max(cases_per_million)) %>%
  arrange(Country_Other) %>%
  head(10)
cases_per_rankONE
## Country_Other TotalCases Population
## 1 USA 31425966 332473823
# Country that ranks last on cases per million population.
# The ratio is computed here from TotalCases and Population so the ranking uses
# cases per million; the previous version compared raw TotalCases.
# NOTE: the recorded output that follows was produced by the TotalCases ranking.
cases_per_rankLAST <- cases_per_million_population %>%
  mutate(cases_per_million = TotalCases / (Population / 1000000)) %>%
  filter(cases_per_million == min(cases_per_million)) %>%
  arrange(Country_Other) %>%
  head(10)
cases_per_rankLAST
## Country_Other TotalCases Population
## 1 Wallis and Futuna 420 11089
# Base-graphics exploration of a four-column subset: the three count columns
# plus rank, the "0"/"1" factor derived earlier from TotalDeaths.
corona_new <- current_status %>% select(TotalCases,TotalDeaths,TotalRecovered,rank )
#head(corona_new)
# TotalDeaths against TotalCases, drawn as points joined by lines (type = "b").
plot(corona_new$TotalCases,corona_new$TotalDeaths , type="b")

#head(corona_new)
# Distribution of TotalDeaths within each rank group.
boxplot(TotalDeaths ~ rank, corona_new, xlab = "rank", ylab = "TotalDeaths")

# Histogram of TotalCases.
hist(corona_new$TotalCases)

# Plain scatter plot of TotalRecovered against TotalCases.
with(corona_new, plot(TotalCases, TotalRecovered))

# Same scatter plot, with a title added afterwards by title().
with(corona_new, plot(TotalCases, TotalRecovered))
title(main = "TotalCases vs TotalRecovered")

# Adding colour: draw all points, then overplot rank 1 in blue and rank 0 in red.
with(corona_new, plot(TotalCases, TotalRecovered, main = "TotalCases vs TotalRecovered"))
with(subset(corona_new, rank == 1), points(TotalCases, TotalRecovered, col = "blue"))
with(subset(corona_new, rank == 0), points(TotalCases, TotalRecovered, col = "red"))

# Base plot with annotation: type = "n" sets up the axes without drawing points,
# the two groups are then added in colour, and legend() maps the colours to the
# rank values.
with(corona_new, plot(TotalCases, TotalRecovered, main = "TotalCases vs TotalRecovered", type = "n"))
with(subset(corona_new, rank == 1), points(TotalCases, TotalRecovered, col = "blue"))
with(subset(corona_new, rank == 0), points(TotalCases, TotalRecovered, col = "red"))
legend("topleft", pch = 1, col = c("blue", "red"), legend = c("1", "0"))

# Base Plot with Regression Line
with(corona_new, plot(TotalCases, TotalRecovered, main = "TotalCases vs TotalRecovered", pch = 20))
# Regress the plotted y variable (TotalRecovered) on the plotted x variable
# (TotalCases) so abline() draws the fitted line in the plot's own coordinates.
# The formula was previously reversed, so the intercept and slope described
# TotalCases as a function of TotalRecovered and did not fit this plot.
model <- lm(TotalRecovered ~ TotalCases, corona_new)
abline(model, lwd = 2)

# R base scatter plot: plot()
x <- corona_new$TotalCases
y <- corona_new$TotalDeaths
# Plot with main and axis titles.
# y holds TotalDeaths, so the titles and the y-axis label name TotalDeaths; they
# previously said "TotalRecovered", which is not the variable being plotted.
# Use point shape pch = 5 and remove the frame.
plot(x, y, main = "TotalCases vs TotalDeaths",
     xlab = "TotalCases", ylab = "TotalDeaths",
     pch = 5, frame = FALSE)

# Add regression line
plot(x, y, main = "TotalCases vs TotalDeaths",
     xlab = "TotalCases", ylab = "TotalDeaths",
     pch = 5, frame = FALSE)
abline(lm(y ~ x), col = "blue")

# Add lowess smooth
plot(x, y, main = "TotalCases vs TotalDeaths",
     xlab = "TotalCases", ylab = "TotalDeaths",
     pch = 5, frame = FALSE)
lines(lowess(x, y), col = "blue")

library(ggplot2)
library("car")
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# car::scatterplot() of TotalDeaths against TotalCases with default settings.
scatterplot(TotalDeaths ~ TotalCases, data = corona_new)

# Same plot with grid = TRUE and frame = FALSE.
# NOTE(review): nothing in this call turns the smoother off, although the
# original comment said it was suppressed.
scatterplot(TotalDeaths ~ TotalCases, data = corona_new,
grid = TRUE, frame = FALSE)

# Same plot with grid = FALSE and frame = FALSE.
scatterplot(TotalDeaths ~ TotalCases, data = corona_new,
grid = FALSE, frame = FALSE)

# smoother = FALSE,
# NOTE(review): the two calls below repeat the plot without any grouping
# variable; the earlier "by groups" heading did not describe them.
library(ggplot2)
scatterplot(TotalDeaths ~ TotalCases, data = corona_new,grid = FALSE, frame = FALSE)

scatterplot(TotalDeaths ~ TotalCases, data = corona_new,grid = FALSE, frame = TRUE)

# Scatter Plot Matrices - R Base Graphs
# Basic plots:
pairs(corona_new[, 1:4], pch = 2)

# Show only upper panel:
pairs(corona_new[, 1:4], pch = 19, lower.panel = NULL)

# Color points by rank group.
# The colour index must come from corona_new itself. The earlier index,
# iris$Species, belongs to an unrelated built-in data set with a different
# number of rows and three levels, so the colours did not correspond to these
# points. rank is a two-level factor, matching the two colours.
my_cols <- c("#00AFBB", "#E7B800")
pairs(corona_new[, 1:4], pch = 2, cex = 1,
      col = my_cols[corona_new$rank],
      lower.panel = NULL)

# Basic box plots
# Box plot of one variable
boxplot(corona_new$TotalRecovered)

# Box plots by group
# TotalDeaths is grouped by the two-level rank factor, as in the rank box plot
# drawn earlier in this file. The previous formula grouped by TotalCases, a
# continuous count, which yields one degenerate box per distinct value instead
# of a comparison between groups.
# remove frame
boxplot(TotalDeaths ~ rank, data = corona_new, frame = FALSE)

# Horizontal box plots
boxplot(TotalDeaths ~ rank, data = corona_new, frame = FALSE,
        horizontal = TRUE)

# Notched box plots
boxplot(TotalDeaths ~ rank, data = corona_new, frame = FALSE,
        notch = TRUE)

# Change group names
#boxplot(TotalDeaths ~ TotalCases, data = corona_new, frame = FALSE, names = c("D0.5", "D1", "D2"))
# Change color
# Change the color of border using one single color
boxplot(TotalDeaths ~ rank, data = corona_new, frame = FALSE,
        border = "steelblue")

# Change the color of border.
# Use a different color for each of the two groups
boxplot(TotalDeaths ~ rank, data = corona_new, frame = FALSE,
        border = c("#999999", "#E69F00"))

# Change fill color : single color
boxplot(TotalDeaths ~ rank, data = corona_new, frame = FALSE,
        col = "steelblue")

# Change group names (kept commented out: the three labels do not match the
# number of bars)
#barplot(corona_new$TotalCases, names.arg = c("A", "B", "C"))
# Bar plot of one variable: one bar per row of corona_new.
barplot(corona_new$TotalCases)

# Horizontal bar plot
barplot(corona_new$TotalCases, horiz = TRUE)

# Line Plots - R Base Graphs
# x and y are the vectors assigned earlier in this file (TotalCases and
# TotalDeaths). lines() redraws the same series on top of the plot() output.
plot(x, y, type = "l", lty = 1)
lines(x, y, type = "l", lty = 1)

#### Plotly
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Scatter plot of TotalRecovered against TotalCases.
# Columns are referenced with bare formulas so they resolve inside `data`
# instead of reaching back to the global data frame with `$`. The trace type and
# mode are stated explicitly; they are the ones plotly reported choosing by
# default in the recorded messages.
fig <- plot_ly(data = corona_data_updated,
               x = ~TotalCases,
               y = ~TotalRecovered,
               type = "scatter",
               mode = "markers",
               marker = list(size = 10,
                             color = 'pink',
                             line = list(color = 'green',
                                         width = 1)))
fig <- fig %>% layout(title = 'Customized Scatter Plot',
                      yaxis = list(zeroline = FALSE),
                      xaxis = list(zeroline = FALSE))
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# head(data1)
# Deaths per million against tests per million, one animation frame per
# continent, marker size scaled by tests per million, country name on hover.
# x now uses the bare formula ~Tests_1M_pop like the other aesthetics; the
# previous ~data1$Tests_1M_pop bypassed the piped data and read the global
# data frame directly.
fig <- data1 %>%
  plot_ly(
    x = ~Tests_1M_pop,
    y = ~Deaths_1M_pop,
    size = ~Tests_1M_pop,
    frame = ~Continent,
    text = ~Country_Other,
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  )
# Logarithmic x axis.
fig <- fig %>% layout(
  xaxis = list(
    type = "log"
  )
)
fig
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
#fig <- plot_ly(x = ~data1$TotalCases, y = ~data1$TotalDeaths, z = ~data1$TotalRecovered,data=, type = #Data #Visualisation
# Two marker traces per continent: cases per 1M population (red) and active
# cases (blue).
# The trace name and the title now describe the plotted columns; previously the
# per-million trace was labelled "TotalCases" and the title mentioned deaths and
# recovered cases, neither of which is drawn.
# NOTE(review): the two traces share one x axis but mix a per-million rate with
# an absolute count -- confirm that this comparison is intended.
fig <- plot_ly(data1, x = ~Tot_Cases_1M_pop, y = ~Continent, name = "Tot_Cases_1M_pop",
               type = 'scatter', mode = "markers",
               marker = list(color = "red", opacity = 0.4), size = 5) %>%
  add_trace(x = ~ActiveCases, y = ~Continent, name = "ActiveCases",
            type = 'scatter', mode = "markers",
            marker = list(color = "blue", opacity = 0.4)) %>%
  layout(title = "Cases per 1M population and active cases by continent")
fig
# Data Preparation
# One label/value pair per country. The values are absolute death counts so the
# slices match the "Total deaths" title; the previous values were
# Deaths_1M_pop, and adding per-million rates of different countries together
# does not give a continent's deaths.
labels <- data1$Continent
values <- data1$TotalDeaths
# Data Visualization
fig <- plot_ly(type='pie', labels=labels, values=values,
               textinfo='label+percent',
               insidetextorientation='radial') %>% layout(title = "Total deaths")
fig
# Data Preparation: continent label and active-case count for every country.
values <- data1$ActiveCases
labels <- data1$Continent
# Data Visualization: pie chart of active cases with continent labels.
fig <- plot_ly(
  type = 'pie',
  labels = labels,
  values = values,
  textinfo = 'label+percent',
  insidetextorientation = 'radial'
)
fig <- layout(fig, title = "Total ActiveCases")
fig